#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Oct  8 08:56:55 2018
Script for Scraping SBIR Project Pages
Grant A. Allard
10-09-2018
For Use on EC2
@author: GrantAllard
"""

#########Udpated code with list segmentation#############

#Load Libraries
import pandas as pd #Good 
from urllib.request import urlopen #Good 
from bs4 import BeautifulSoup #Installed 
import requests #Good 
import time #Installed
import datetime#Assume it is in base package
import logging
import mysql.connector
import sqlalchemy
from sqlalchemy import create_engine, Table, Column, Integer, String, MetaData, ForeignKey



#Set Logging
logger = logging.getLogger('SBIRscraper')
logfile = logging.FileHandler('/home/ubuntu/logs.log')
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
logfile.setFormatter(formatter)
logger.addHandler(logfile)
logger.setLevel(logging.INFO)

#Read in Excel Data

#Turn Links into data frame
#links_df = pd.DataFrame(data=links)
#links_df.columns = ['links']
#links_df = links_df.iloc[1:,]
#links_df7=links_df.iloc[12027:12627]
#links_df8=links_df.iloc[12627:13627]
#links_df9=links_df.iloc[13627:33627]
#links_df10=links_df.iloc[33627:53627]
#links_df11=links_df.iloc[53627:73627]
#links_df12=links_df.iloc[73627:88674]


#Export to MySQL Server
#engine = create_engine('mysql+mysqlconnector://enterprise_1701:Computer99@starship.cm8hvr9jm2gq.us-east-2.rds.amazonaws.com:3306/borg')
#links_df7.to_sql(name='links_df7', con=engine, schema=None, if_exists= 'replace', index=False)
#links_df8.to_sql(name='links_df8', con=engine, schema=None, if_exists= 'replace', index=False)
#links_df9.to_sql(name='links_df9', con=engine, schema=None, if_exists= 'replace', index=False)
#links_df10.to_sql(name='links_df10', con=engine, schema=None, if_exists= 'replace', index=False)
#links_df11.to_sql(name='links_df11', con=engine, schema=None, if_exists= 'replace', index=False)
#links_df12.to_sql(name='links_df12', con=engine, schema=None, if_exists= 'replace', index=False)
#df_names_list = ['links_df7','links_df8','links_df9','links_df10','links_df11','links_df12']



#####Start of EC2 Script 
#Import from MySQL 
engine = create_engine('enter info')
links_df10 = pd.read_sql_table('links_df10', con=engine)
#adjust code for missing records
#links_df# = links_df#.iloc[456:1001]
links_list=links_df10['links'].tolist()


#Web Scrape
#Development Web Scrape#
cols = ['URL', 'Title', 'Address', 'DUNS', 'Info', 'Contact', 'Amount']
data=pd.DataFrame(columns=cols)

#Set User Agent Header
headers = requests.utils.default_headers()
headers

headers.update( 
    {
    'User-Agent': 'Grant A. Allard, Doctoral Student at Clemson. <gallard@clemson.edu>. Web Scraper R2D2 Bot. 30 second pause in crawl.'    
        }
    )

#Establish start time
starttime = datetime.datetime.now()
print(starttime)

#####Updated Loop####

loop_starttime = (datetime.datetime.now())
loop_starttime = loop_starttime.strftime('%m-%d-%Y.%H:%M:%S')
loop_starttime
logger.info('Crawl began')
print('Crawl began', loop_starttime )

#Loop
#logger.info('Started scraping loop')
for i in links_list:
        starttime = (datetime.datetime.now())
        starttime = starttime.strftime('%m-%d-%Y.%H:%M:%S')
        print('search began', starttime)
        url = i
        print(i)
        logger.info('search began'+ ' '+ url)
        html = urlopen(url)
        r = requests.get(url, headers=headers)
        html_doc = r.text
        soup = BeautifulSoup(html_doc, 'lxml')
        Title = soup.find("div", {"sbc-name-wrapper"}).get_text().replace('\n','')
        Address = soup.find("div", {"sbc-address-wrapper"}).get_text().replace('\n','')
        DUNS = soup.find("div", {"col-md-12"}).get_text().replace('\n','')
        Info = soup.find("div", {"small-business-info-wrapper"}).get_text().replace('\n', ' ')
        Contact = soup.find("div", attrs={'class': 'row award-sub-wrapper'}).get_text().replace('\n','')
        Amount = soup.find("div", attrs={'class': 'col-md-3'}).get_text().replace('\n','')
        d = {'URL': url, 'Title': Title, 'Address': Address, 'DUNS':DUNS,'Info': Info, 'Contact': Contact, 'Amount': Amount }
        #data = data.append(d, ignore_index=True )
        finish = (datetime.datetime.now())
        finish = finish.strftime('%m-%d-%Y.%H:%M:%S')
        logger.info('search complete'+ ' ' + url)
        print('search complete', finish)
        #Store Data
        data_df = pd.DataFrame(d, index=[0])
        sendtime = (datetime.datetime.now())
        sendtime = sendtime.strftime('%m-%d-%Y.%H:%M:%S')
        #Send to SQL
        df= data_df
        engine = create_engine('mysql+mysqlconnector://enterprise_1701:Computer99@starship.cm8hvr9jm2gq.us-east-2.rds.amazonaws.com:3306/borg')
        df.to_sql(name='DUNS_df10', con=engine, if_exists= 'append', index=False)
        print('Send to sql complete', sendtime)
        time.sleep(29.9)
 
        

    
logger.info('Complete scraping loop')
loop_endtime = (datetime.datetime.now())
loop_endtime = loop_endtime.strftime('%m-%d-%Y.%H:%M:%S')
print('Crawl complete', loop_endtime)
